In [1]:
# Install additional libraries
! sudo pip install pandas
! sudo pip install matplotlib
! sudo apt-get -y install python3-tk


Requirement already satisfied (use --upgrade to upgrade): pandas in /usr/local/lib/python3.4/dist-packages
Requirement already satisfied (use --upgrade to upgrade): python-dateutil>=2 in /usr/local/lib/python3.4/dist-packages (from pandas)
Requirement already satisfied (use --upgrade to upgrade): pytz>=2011k in /root/.local/lib/python3.4/site-packages (from pandas)
Requirement already satisfied (use --upgrade to upgrade): numpy>=1.7.0 in /usr/lib/python3/dist-packages (from pandas)
Requirement already satisfied (use --upgrade to upgrade): six>=1.5 in /usr/local/lib/python3.4/dist-packages (from python-dateutil>=2->pandas)
You are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
Requirement already satisfied (use --upgrade to upgrade): matplotlib in /usr/local/lib/python3.4/dist-packages
Requirement already satisfied (use --upgrade to upgrade): pyparsing!=2.0.0,!=2.0.4,!=2.1.2,>=1.5.6 in /usr/local/lib/python3.4/dist-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): pytz in /root/.local/lib/python3.4/site-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): python-dateutil in /usr/local/lib/python3.4/dist-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): cycler in /usr/local/lib/python3.4/dist-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): numpy>=1.6 in /usr/lib/python3/dist-packages (from matplotlib)
Requirement already satisfied (use --upgrade to upgrade): six>=1.5 in /usr/local/lib/python3.4/dist-packages (from python-dateutil->matplotlib)
You are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
python3-tk is already the newest version.
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.

In [2]:
# import required libs
from revscoring.dependencies import solve
from revscoring.features import wikitext
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [3]:
# Load dataset
df = pd.read_csv("enwiki.draft_quality.75_not_OK_sample.censored.tsv", sep="\t")
df.head()


Out[3]:
page_title rev_id creation_timestamp archived draft_quality censored_text
0 Government_Achuthan_girls_hss 688249460 20151030165831 1 spam 125 years ago APPU NEDUNGADI STARTED THIS SCH...
1 Spiromax_EDMS 731688963 20160726220726 1 spam Spiromax is a British technology media and com...
2 Steph_Curry_UA_2 693824273 20151205035603 1 spam The Steph Curry UA two Shoes are a hot-selling...
3 Valletta_Cruise_Port 722151405 20160526085302 1 spam ''Valletta Cruise Port plc''' is a private com...
4 RWG_Mobile 731355950 20160724204124 1 spam [[File:RWGmobile.png|thumb|RWG Mobile logo]]\n...
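
The cells that follow compute revscoring wikitext features for every draft. They all use the same pattern: solve() takes a list of features and a cache that supplies the raw text datasource, and yields the feature values in order. A minimal illustration (the example string is made up, not taken from the dataset):

In [ ]:
# Minimal sketch of the solve() pattern used in the feature cells below.
example_text = "Hello [[world]]!"  # hypothetical example string
print(list(solve([wikitext.revision.chars, wikitext.revision.wikilinks],
                 cache={'datasource.revision.text': example_text})))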

Character features


In [4]:
# The number of characters
chars = lambda x:list(solve([ wikitext.revision.chars], cache={'datasource.revision.text': x}))[0]
df["chars"] = df["censored_text"].apply(chars)

# whitespace_chars
whitespace_chars = lambda x:list(solve([ wikitext.revision.whitespace_chars], cache={'datasource.revision.text': x}))[0]
df["whitespace_chars"] = df["censored_text"].apply(whitespace_chars)

# The number of wikitext markup characters
markup_chars = lambda x:list(solve([ wikitext.revision.markup_chars], cache={'datasource.revision.text': x}))[0]
df["markup_chars"] = df["censored_text"].apply(markup_chars)

# The number of Chinese/Japanese/Korean characters
cjk_chars = lambda x:list(solve([ wikitext.revision.cjk_chars], cache={'datasource.revision.text': x}))[0]
df["cjk_chars"] = df["censored_text"].apply(cjk_chars)

# The number of HTML entity characters
entity_chars = lambda x:list(solve([ wikitext.revision.entity_chars], cache={'datasource.revision.text': x}))[0]
df["entity_chars"] = df["censored_text"].apply(entity_chars)

# The number of URL characters
url_chars = lambda x:list(solve([ wikitext.revision.url_chars], cache={'datasource.revision.text': x}))[0]
df["url_chars"] = df["censored_text"].apply(url_chars)

# The number of word characters
word_chars = lambda x:list(solve([ wikitext.revision.word_chars], cache={'datasource.revision.text': x}))[0]
df["word_chars"] = df["censored_text"].apply(word_chars)

# The number of UPPERCASE WORD characters
uppercase_word_chars = lambda x:list(solve([ wikitext.revision.uppercase_word_chars], cache={'datasource.revision.text': x}))[0]
df["uppercase_word_chars"] = df["censored_text"].apply(uppercase_word_chars)

# The number of punctuation characters
punctuation_chars = lambda x:list(solve([ wikitext.revision.punctuation_chars], cache={'datasource.revision.text': x}))[0]
df["punctuation_chars"] = df["censored_text"].apply(punctuation_chars)

# The number of break characters
break_chars = lambda x:list(solve([ wikitext.revision.break_chars], cache={'datasource.revision.text': x}))[0]
df["break_chars"] = df["censored_text"].apply(break_chars)

# The length of the longest character repetition
longest_repeated_char = lambda x:list(solve([ wikitext.revision.longest_repeated_char], cache={'datasource.revision.text': x}))[0]
df["longest_repeated_char"] = df["censored_text"].apply(longest_repeated_char)

Tokenized features


In [5]:
# The number of tokens
tokens = lambda x:list(solve([ wikitext.revision.tokens], cache={'datasource.revision.text': x}))[0]
df["tokens"] = df["censored_text"].apply(tokens)

# The number of number tokens
numbers = lambda x:list(solve([ wikitext.revision.numbers], cache={'datasource.revision.text': x}))[0]
df["numbers"] = df["censored_text"].apply(numbers)

# The number of whitespace tokens
whitespaces = lambda x:list(solve([ wikitext.revision.whitespaces], cache={'datasource.revision.text': x}))[0]
df["whitespaces"] = df["censored_text"].apply(whitespaces)

# The number of markup tokens
markups = lambda x:list(solve([ wikitext.revision.markups], cache={'datasource.revision.text': x}))[0]
df["markups"] = df["censored_text"].apply(markups)

# The number of Chinese/Japanese/Korean tokens
cjks = lambda x:list(solve([ wikitext.revision.cjks], cache={'datasource.revision.text': x}))[0]
df["cjks"] = df["censored_text"].apply(cjks)

# The number of HTML entity tokens
entities = lambda x:list(solve([ wikitext.revision.entities], cache={'datasource.revision.text': x}))[0]
df["entities"] = df["censored_text"].apply(entities)

# The number of URL tokens
urls = lambda x:list(solve([ wikitext.revision.urls], cache={'datasource.revision.text': x}))[0]
df["urls"] = df["censored_text"].apply(urls)

# The number of word tokens
words = lambda x:list(solve([ wikitext.revision.words], cache={'datasource.revision.text': x}))[0]
df["words"] = df["censored_text"].apply(words)

# The number of UPPERCASE word tokens
uppercase_words = lambda x:list(solve([ wikitext.revision.uppercase_words], cache={'datasource.revision.text': x}))[0]
df["uppercase_words"] = df["censored_text"].apply(uppercase_words)

# The number of punctuation tokens
punctuations = lambda x:list(solve([ wikitext.revision.punctuations], cache={'datasource.revision.text': x}))[0]
df["punctuations"] = df["censored_text"].apply(punctuations)

# The number of break tokens
breaks = lambda x:list(solve([ wikitext.revision.breaks], cache={'datasource.revision.text': x}))[0]
df["breaks"] = df["censored_text"].apply(breaks)

# The length of the longest token
longest_token = lambda x:list(solve([ wikitext.revision.longest_token], cache={'datasource.revision.text': x}))[0]
df["longest_token"] = df["censored_text"].apply(longest_token)

# The length of the longest word-token
longest_word = lambda x:list(solve([ wikitext.revision.longest_word], cache={'datasource.revision.text': x}))[0]
df["longest_word"] = df["censored_text"].apply(longest_word)

Parsed features


In [6]:
# The number of characters of viewable content (no markup or templates)
content_chars = lambda x:list(solve([ wikitext.revision.content_chars], cache={'datasource.revision.text': x}))[0]
df["content_chars"] = df["censored_text"].apply(content_chars)

# The number of headings
headings = lambda x:list(solve([ wikitext.revision.headings], cache={'datasource.revision.text': x}))[0]
df["headings"] = df["censored_text"].apply(headings)

# The number of external links
external_links = lambda x:list(solve([ wikitext.revision.external_links], cache={'datasource.revision.text': x}))[0]
df["external_links"] = df["censored_text"].apply(external_links)

# The number of wikilinks (internal to other pages in the wiki)
wikilinks = lambda x:list(solve([ wikitext.revision.wikilinks], cache={'datasource.revision.text': x}))[0]
df["wikilinks"] = df["censored_text"].apply(wikilinks)

# The number of HTML tags
tags = lambda x:list(solve([ wikitext.revision.tags], cache={'datasource.revision.text': x}))[0]
df["tags"] = df["censored_text"].apply(tags)

# The number of <ref> tags
ref_tags = lambda x:list(solve([ wikitext.revision.ref_tags], cache={'datasource.revision.text': x}))[0]
df["ref_tags"] = df["censored_text"].apply(ref_tags)

# The number of templates
templates = lambda x:list(solve([ wikitext.revision.templates], cache={'datasource.revision.text': x}))[0]
df["templates"] = df["censored_text"].apply(templates)

Custom features

Same features as above, but expressed as frequencies (normalized by the total character or token count) instead of raw counts

In [7]:
df["whitespace_chars_norm"] = df["whitespace_chars"] / df["chars"]
df["markup_chars_norm"] = df["markup_chars"] / df["chars"]
df["cjk_chars_norm"] = df["cjk_chars"] / df["chars"]
df["entity_chars_norm"] = df["entity_chars"] / df["chars"]
df["url_chars_norm"] = df["url_chars"] / df["chars"]
df["word_chars_norm"] = df["word_chars"] / df["chars"]
df["uppercase_word_chars_norm"] = df["uppercase_word_chars"] / df["chars"]
df["punctuation_chars_norm"] = df["punctuation_chars"] / df["chars"]
df["break_chars_norm"] = df["break_chars"] / df["chars"]
df["longest_repeated_char_norm"] = df["longest_repeated_char"] / df["chars"]

In [8]:
df["numbers_norm"] = df["numbers"] / df["tokens"]
df["whitespaces_norm"] = df["whitespaces"] / df["tokens"]
df["markups_norm"] = df["markups"] / df["tokens"]
df["cjks_norm"] = df["cjks"] / df["tokens"]
df["entities_norm"] = df["entities"] / df["tokens"]
df["urls_norm"] = df["urls"] / df["tokens"]
df["words_norm"] = df["words"] / df["tokens"]
df["uppercase_words_norm"] = df["uppercase_words"] / df["tokens"]
df["punctuations_norm"] = df["punctuations"] / df["tokens"]
df["breaks_norm"] = df["breaks"] / df["tokens"]
df["longest_token_norm"] = df["longest_token"] / df["tokens"]

In [9]:
### Recap the columns in the main dataframe
df.columns


Out[9]:
Index(['page_title', 'rev_id', 'creation_timestamp', 'archived',
       'draft_quality', 'censored_text', 'chars', 'whitespace_chars',
       'markup_chars', 'cjk_chars', 'entity_chars', 'url_chars', 'word_chars',
       'uppercase_word_chars', 'punctuation_chars', 'break_chars',
       'longest_repeated_char', 'tokens', 'numbers', 'whitespaces', 'markups',
       'cjks', 'entities', 'urls', 'words', 'uppercase_words', 'punctuations',
       'breaks', 'longest_token', 'longest_word', 'content_chars', 'headings',
       'external_links', 'wikilinks', 'tags', 'ref_tags', 'templates',
       'whitespace_chars_norm', 'markup_chars_norm', 'cjk_chars_norm',
       'entity_chars_norm', 'url_chars_norm', 'word_chars_norm',
       'uppercase_word_chars_norm', 'punctuation_chars_norm',
       'break_chars_norm', 'longest_repeated_char_norm', 'numbers_norm',
       'whitespaces_norm', 'markups_norm', 'cjks_norm', 'entities_norm',
       'urls_norm', 'words_norm', 'uppercase_words_norm', 'punctuations_norm',
       'breaks_norm', 'longest_token_norm'],
      dtype='object')

Feature selection


In [10]:
### We consider only the features we've defined above
features = df.columns[6:]
### The target is the draft quality label
target = df.columns[4]

In [11]:
# Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# create a base classifier used to evaluate a subset of attributes
model = LogisticRegression()
# create the RFE model and select 4 attributes
rfe = RFE(model, 4)
rfe = rfe.fit(df[features], df[target])
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)
features[rfe.support_]


[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False  True False False  True False False False False False False
 False False False False  True False  True False False False False False
 False False False False]
[28 22 19 29 43 27 24 25  7 47  8 26 17 20 11 33 44  4 21  5  6 45 31 13 35
 49  1 10 18  1  9 12 16 40 48 32  3 34 30 46  1 36  1 15 39 42 38  2 37 23
 41 14]
Out[11]:
Index(['external_links', 'ref_tags', 'longest_repeated_char_norm',
       'whitespaces_norm'],
      dtype='object')
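
Beyond the boolean support mask, the full ranking can be paired with the feature names (a small sketch, not part of the original run):

In [ ]:
# Sketch: pair each feature name with its RFE rank (1 = kept by the selector).
ranking = pd.Series(rfe.ranking_, index=features).sort_values()
print(ranking.head(10))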

In [12]:
rfe.score(df[features], df[target])


Out[12]:
0.54666666666666663

In [13]:
# Feature Importance
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
# fit an Extra Trees model to the data
model = ExtraTreesClassifier()
model.fit(df[features], df[target])
# display the relative importance of each attribute
print(model.feature_importances_)


[ 0.02242708  0.02013383  0.00327857  0.003       0.          0.01405852
  0.04051879  0.0288158   0.02686494  0.          0.01993469  0.02514189
  0.02579609  0.02749638  0.00461111  0.00209474  0.          0.01846405
  0.00969322  0.0211931   0.03594521  0.          0.03385794  0.04222701
  0.03490241  0.          0.00452698  0.00667216  0.02981208  0.00732097
  0.01397508  0.04622916  0.03295663  0.          0.          0.02118819
  0.0175171   0.02494762  0.03371686  0.          0.03176076  0.02065031
  0.06273696  0.02446866  0.          0.          0.02926535  0.03336858
  0.020728    0.05009944  0.          0.02760372]

In [14]:
model.score(df[features], df[target])


Out[14]:
1.0

In [15]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.08, random_state=0)
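
With only 75 rows, test_size=0.08 leaves just 6 held-out examples, so the test score computed two cells below is very noisy. A hedged alternative sketch using k-fold cross-validation instead (same legacy sklearn.cross_validation module as the import above; in scikit-learn 0.18+ these utilities live in sklearn.model_selection):

In [ ]:
# Sketch: 5-fold cross-validation gives a more stable estimate than a 6-example test set.
from sklearn.cross_validation import cross_val_score
cv_scores = cross_val_score(ExtraTreesClassifier(), df[features], df[target], cv=5)
print(cv_scores.mean(), cv_scores.std())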

In [16]:
model2 = ExtraTreesClassifier()
model2.fit(X_train, y_train)


Out[16]:
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [17]:
model2.score(X_test,y_test)


Out[17]:
0.66666666666666663

In [18]:
y_test


Out[18]:
52    vandalism
57    vandalism
22         spam
74    vandalism
28       attack
26       attack
Name: draft_quality, dtype: object

In [19]:
df.describe()


Out[19]:
rev_id creation_timestamp archived chars whitespace_chars markup_chars cjk_chars entity_chars url_chars word_chars ... whitespaces_norm markups_norm cjks_norm entities_norm urls_norm words_norm uppercase_words_norm punctuations_norm breaks_norm longest_token_norm
count 7.500000e+01 7.500000e+01 75.0 75.000000 75.000000 75.000000 75.000000 75.0 75.000000 75.000000 ... 75.000000 75.000000 75.000000 75.0 75.000000 75.000000 75.000000 75.000000 75.0 75.000000
mean 7.055759e+08 2.015705e+13 1.0 2311.813333 331.066667 49.960000 3.533333 0.0 167.786667 1634.026667 ... 0.396819 0.025265 0.006040 0.0 0.002316 0.454844 0.013982 0.042309 0.0 0.205263
std 1.687496e+07 4.548500e+09 0.0 5458.414872 812.644953 225.294908 30.599564 0.0 768.499737 3945.208764 ... 0.097645 0.061800 0.052307 0.0 0.004710 0.081163 0.043556 0.026546 0.0 0.250185
min 6.770522e+08 2.015082e+13 1.0 1.000000 0.000000 0.000000 0.000000 0.0 0.000000 1.000000 ... 0.000000 0.000000 0.000000 0.0 0.000000 0.210256 0.000000 0.000000 0.0 0.001913
25% 6.905391e+08 2.015111e+13 1.0 168.000000 29.500000 0.000000 0.000000 0.0 0.000000 126.500000 ... 0.384018 0.000000 0.000000 0.0 0.000000 0.430794 0.000000 0.023999 0.0 0.042273
50% 7.073848e+08 2.016023e+13 1.0 631.000000 93.000000 0.000000 0.000000 0.0 0.000000 453.000000 ... 0.419355 0.000000 0.000000 0.0 0.000000 0.456989 0.003135 0.042503 0.0 0.114754
75% 7.215654e+08 2.016052e+13 1.0 1715.000000 253.000000 12.000000 0.000000 0.0 43.000000 1167.500000 ... 0.453685 0.020739 0.000000 0.0 0.003194 0.474541 0.011230 0.056349 0.0 0.250193
max 7.318231e+08 2.016073e+13 1.0 36704.000000 6167.000000 1619.000000 265.000000 0.0 6461.000000 29072.000000 ... 0.499863 0.400000 0.452991 0.0 0.022727 1.000000 0.340909 0.117647 0.0 1.113636

8 rows × 55 columns


In [20]:
df.boxplot(by='draft_quality', column=['external_links', 'ref_tags', 'whitespaces_norm', 'longest_repeated_char_norm'], figsize=(15,15))


Out[20]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f31c8d50710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f31c8c7db38>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f31c8c88828>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f31c6be92b0>]], dtype=object)

In [21]:
X = df[features]
y = df[target]
# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()


Feature ranking:
1. feature 49 (0.048787)
2. feature 31 (0.045785)
3. feature 42 (0.040480)
4. feature 38 (0.033654)
5. feature 41 (0.033084)
6. feature 12 (0.033045)
7. feature 8 (0.032515)
8. feature 24 (0.031702)
9. feature 22 (0.031495)
10. feature 35 (0.030968)
11. feature 20 (0.029690)
12. feature 7 (0.028881)
13. feature 46 (0.028815)
14. feature 40 (0.028371)
15. feature 36 (0.027828)
16. feature 23 (0.027645)
17. feature 0 (0.027449)
18. feature 19 (0.026808)
19. feature 6 (0.026189)
20. feature 13 (0.025778)
21. feature 47 (0.025613)
22. feature 51 (0.025340)
23. feature 11 (0.024404)
24. feature 48 (0.023871)
25. feature 18 (0.023631)
26. feature 10 (0.023285)
27. feature 1 (0.023002)
28. feature 37 (0.021291)
29. feature 17 (0.020924)
30. feature 32 (0.020282)
31. feature 28 (0.019252)
32. feature 5 (0.018532)
33. feature 26 (0.018427)
34. feature 43 (0.017161)
35. feature 14 (0.014858)
36. feature 2 (0.011967)
37. feature 29 (0.009797)
38. feature 27 (0.008311)
39. feature 30 (0.007093)
40. feature 15 (0.001569)
41. feature 33 (0.000889)
42. feature 3 (0.000767)
43. feature 44 (0.000766)
44. feature 39 (0.000000)
45. feature 16 (0.000000)
46. feature 34 (0.000000)
47. feature 21 (0.000000)
48. feature 9 (0.000000)
49. feature 45 (0.000000)
50. feature 4 (0.000000)
51. feature 50 (0.000000)
52. feature 25 (0.000000)
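
The ranking above refers to features by column index. A small sketch (not in the original run) that maps indices back to names and labels the bars accordingly:

In [ ]:
# Sketch: print and plot the ranking with feature names instead of indices.
for f in range(X.shape[1]):
    print("%d. %s (%f)" % (f + 1, features[indices[f]], importances[indices[f]]))
plt.figure(figsize=(15, 5))
plt.bar(range(X.shape[1]), importances[indices], color="r",
        yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), features[indices], rotation=90)
plt.show()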

In [22]:
forest.score(X,y)


Out[22]:
1.0

Univariate Selection


In [23]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
modelKbest = SelectKBest(chi2, k=6)
fit = modelKbest.fit(df[features], df[target])
newFeatures = fit.transform(df[features])
print(fit.scores_)
#print(newFeatures[0:5,:])
print(fit.get_support())
features[fit.get_support()]


[  1.21181914e+04   3.43480387e+02   1.66958687e+03   5.30000000e+02
              nan   1.69586631e+04   4.24289131e+03   5.75900990e+02
   4.58051635e+01              nan   2.86614173e+00   1.75225663e+03
   1.27789474e+02   2.86531485e+02   9.43257957e+02   5.30000000e+02
              nan   2.47869565e+02   4.95796198e+02   1.35363344e+02
   3.68792373e+01              nan   1.26442403e+03   1.47293869e+01
   2.54033536e+03              nan   2.59559322e+02   4.69588629e+02
   1.17694444e+02   1.40971429e+02   9.92352941e+01   5.60449245e-02
   7.10438701e-01   4.66960352e-01              nan   1.66404708e+00
   2.59905456e-02   2.20812921e-01   1.87301396e-01              nan
   1.83804595e+00   9.40823038e-02   1.61861986e-01   5.91254042e-01
   9.05982906e-01              nan   1.17471543e-01   4.18547839e-02
   2.82942520e-01   8.80110451e-02              nan   3.88146133e-01]
[ True False  True False False  True  True False False False False  True
 False False False False False False False False False False False False
  True False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False]
/root/.local/lib/python3.4/site-packages/sklearn/feature_selection/univariate_selection.py:165: RuntimeWarning: invalid value encountered in true_divide
  chisq /= f_exp
Out[23]:
Index(['chars', 'markup_chars', 'url_chars', 'word_chars', 'tokens',
       'content_chars'],
      dtype='object')
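
The nan scores and the RuntimeWarning come from features that are constant (all zeros) in this 75-row sample, e.g. entity_chars and break_chars, which make the expected frequencies in the chi-squared test zero. A hedged sketch that drops constant columns before scoring:

In [ ]:
# Sketch: exclude constant (all-zero) columns, which yield nan chi2 scores.
nonconstant = [f for f in features if df[f].nunique() > 1]
fit_nc = SelectKBest(chi2, k=6).fit(df[nonconstant], df[target])
print(pd.Index(nonconstant)[fit_nc.get_support()])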

PCA


In [28]:
from sklearn.decomposition import PCA
from sklearn import preprocessing
newf = preprocessing.scale(df[features])  # standardize features (zero mean, unit variance)
pca = PCA(n_components=10)
pca.fit(newf)
print(pca.explained_variance_ratio_)


[ 0.31947328  0.13826693  0.10669993  0.09154999  0.07189913  0.058532
  0.04216894  0.03833435  0.02857489  0.02313594]
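
Taken together, these ten components account for roughly 92% of the variance in the scaled feature matrix; a quick check of the cumulative share (sketch):

In [ ]:
# Sketch: cumulative explained variance of the fitted components.
print(np.cumsum(pca.explained_variance_ratio_))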
